# Print a compact overview of the raw `listings` table: row and column counts,
# then each column's name, type and first few values.
glimpse(listings)
Rows: 12,540
Columns: 74
$ id                                           <dbl> 2078, 2843, 29059, 29061,~
$ listing_url                                  <chr> "https://www.airbnb.com/r~
$ scrape_id                                    <dbl> 2.021091e+13, 2.021091e+1~
$ last_scraped                                 <date> 2021-09-15, 2021-09-15, ~
$ name                                         <chr> "Central comfy close to M~
$ description                                  <chr> "Centrally located in tre~
$ neighborhood_overview                        <chr> "Although my place is cen~
$ picture_url                                  <chr> "https://a0.muscache.com/~
$ host_id                                      <dbl> 2277, 2319, 125031, 12503~
$ host_url                                     <chr> "https://www.airbnb.com/u~
$ host_name                                    <chr> "Nelia", "Gail", "Marylin~
$ host_since                                   <date> 2008-08-19, 2008-08-19, ~
$ host_location                                <chr> "Montreal, Québec, Canada~
$ host_about                                   <chr> "Honest, quiet and adhere~
$ host_response_time                           <chr> "within an hour", "within~
$ host_response_rate                           <chr> "100%", "100%", "100%", "~
$ host_acceptance_rate                         <chr> "100%", "100%", "94%", "9~
$ host_is_superhost                            <lgl> FALSE, TRUE, FALSE, FALSE~
$ host_thumbnail_url                           <chr> "https://a0.muscache.com/~
$ host_picture_url                             <chr> "https://a0.muscache.com/~
$ host_neighbourhood                           <chr> "Le Plateau", "Little Bur~
$ host_listings_count                          <dbl> 1, 6, 2, 2, 6, 1, 7, 3, 2~
$ host_total_listings_count                    <dbl> 1, 6, 2, 2, 6, 1, 7, 3, 2~
$ host_verifications                           <chr> "['email', 'phone', 'revi~
$ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ neighbourhood                                <chr> "Montreal, Quebec, Canada~
$ neighbourhood_cleansed                       <chr> "Le Plateau-Mont-Royal", ~
$ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, N~
$ latitude                                     <dbl> 45.52859, 45.48496, 45.51~
$ longitude                                    <dbl> -73.58480, -73.58001, -73~
$ property_type                                <chr> "Private room in resident~
$ room_type                                    <chr> "Private room", "Private ~
$ accommodates                                 <dbl> 2, 2, 4, 5, 1, 5, 2, 1, 7~
$ bathrooms                                    <lgl> NA, NA, NA, NA, NA, NA, N~
$ bathrooms_text                               <chr> "1 shared bath", "1 share~
$ bedrooms                                     <dbl> 1, 1, 1, 2, 1, 2, 1, 3, 4~
$ beds                                         <dbl> 1, 1, 2, 2, 1, 3, 2, 4, 4~
$ amenities                                    <chr> "[\"Heating\", \"Hair dry~
$ price                                        <chr> "$39.00", "$40.00", "$123~
$ minimum_nights                               <dbl> 2, 1, 2, 3, 2, 4, 5, 10, ~
$ maximum_nights                               <dbl> 7, 365, 60, 21, 365, 1125~
$ minimum_minimum_nights                       <dbl> 2, 1, 2, 3, 2, 4, 5, 10, ~
$ maximum_minimum_nights                       <dbl> 2, 2, 2, 3, 2, 4, 5, 10, ~
$ minimum_maximum_nights                       <dbl> 7, 1125, 1125, 21, 1125, ~
$ maximum_maximum_nights                       <dbl> 7, 1125, 1125, 21, 1125, ~
$ minimum_nights_avg_ntm                       <dbl> 2.0, 1.0, 2.0, 3.0, 2.0, ~
$ maximum_nights_avg_ntm                       <dbl> 7, 1125, 1125, 21, 1125, ~
$ calendar_updated                             <lgl> NA, NA, NA, NA, NA, NA, N~
$ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ availability_30                              <dbl> 0, 0, 4, 4, 0, 2, 0, 12, ~
$ availability_60                              <dbl> 0, 0, 34, 21, 0, 21, 0, 4~
$ availability_90                              <dbl> 0, 0, 64, 51, 1, 51, 0, 7~
$ availability_365                             <dbl> 3, 45, 328, 306, 276, 272~
$ calendar_last_scraped                        <date> 2021-09-15, 2021-09-15, ~
$ number_of_reviews                            <dbl> 245, 152, 331, 79, 156, 4~
$ number_of_reviews_ltm                        <dbl> 0, 9, 3, 12, 3, 0, 3, 0, ~
$ number_of_reviews_l30d                       <dbl> 0, 3, 0, 4, 1, 0, 1, 0, 0~
$ first_review                                 <date> 2012-07-29, 2012-12-23, ~
$ last_review                                  <date> 2018-07-29, 2021-09-07, ~
$ review_scores_rating                         <dbl> 4.63, 4.40, 4.69, 4.61, 4~
$ review_scores_accuracy                       <dbl> 4.84, 4.52, 4.82, 4.84, 4~
$ review_scores_cleanliness                    <dbl> 4.53, 4.30, 4.70, 4.63, 4~
$ review_scores_checkin                        <dbl> 4.90, 4.62, 4.80, 4.81, 4~
$ review_scores_communication                  <dbl> 4.84, 4.74, 4.79, 4.75, 4~
$ review_scores_location                       <dbl> 4.83, 4.71, 4.79, 4.85, 4~
$ review_scores_value                          <dbl> 4.81, 4.64, 4.73, 4.66, 4~
$ license                                      <dbl> NA, NA, NA, NA, NA, NA, N~
$ instant_bookable                             <lgl> TRUE, TRUE, FALSE, FALSE,~
$ calculated_host_listings_count               <dbl> 1, 5, 2, 2, 5, 1, 7, 2, 2~
$ calculated_host_listings_count_entire_homes  <dbl> 0, 2, 2, 2, 2, 1, 7, 0, 0~
$ calculated_host_listings_count_private_rooms <dbl> 1, 3, 0, 0, 3, 0, 0, 2, 2~
$ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ reviews_per_month                            <dbl> 2.20, 1.43, 2.83, 0.69, 1~

1 Exploratory Data Analysis (EDA) of Airbnb Properties in Montreal

1.1 Initial view of the data

An initial glimpse of the data shows 12,540 observations of 74 variables. Not all of these will have an effect on price (for example, host_url), so we manually removed such variables. We also filtered the data to listings with a minimum stay of 4 nights or fewer that accommodate 2 or more people. We called the resulting data set newlistings1.

newlistings1 <- listings %>%
  mutate(
    ## "price": parse the currency string (e.g. "$39.00") into a number
    price = parse_number(price),
    ## "Property_Type": keep the four listed property types as they are and
    ## pool every other type into "Other" (five categories in total)
    Property_Type = if_else(
      property_type %in% c(
        "Entire rental unit",
        "Private room in rental unit",
        "Entire condominium (condo)",
        "Entire loft"
      ),
      property_type,
      "Other"
    )
  ) %>%
  ## Keep listings that suit 2 people staying 4 nights: a minimum stay of at
  ## most 4 nights and room for at least 2 guests
  filter(minimum_nights <= 4, accommodates >= 2)


## Keep only the columns used by the later cleaning, plotting and regression
## steps. Columns are chosen by name rather than by position, so the step does
## not silently pick the wrong columns if the column order of `listings`
## changes, and it can be re-run safely. The order below is the order the
## later steps see.
keep_cols <- c(
  "listing_url",
  "host_response_rate",
  "host_acceptance_rate",
  "host_is_superhost",
  "host_listings_count",
  "host_verifications",
  "host_has_profile_pic",
  "host_identity_verified",
  "room_type",
  "accommodates",
  "bathrooms_text",
  "bedrooms",
  "beds",
  "amenities",
  "price",
  "minimum_nights",
  "maximum_nights",
  "availability_30",
  "availability_60",
  "number_of_reviews",
  "number_of_reviews_ltm",
  "number_of_reviews_l30d",
  "review_scores_rating",
  "review_scores_accuracy",
  "review_scores_cleanliness",
  "review_scores_checkin",
  "review_scores_communication",
  "review_scores_location",
  "review_scores_value",
  "instant_bookable",
  "reviews_per_month",
  "Property_Type"
)

## Subsetting by name raises an error if any expected column is missing.
newlistings1 <- newlistings1[keep_cols]

# Check the filtered and trimmed table: row count, remaining columns and types.
glimpse(newlistings1)
Rows: 7,400
Columns: 32
$ listing_url                 <chr> "https://www.airbnb.com/rooms/2078", "http~
$ host_response_rate          <chr> "100%", "100%", "100%", "100%", "100%", "N~
$ host_acceptance_rate        <chr> "100%", "100%", "94%", "94%", "0%", "N/A",~
$ host_is_superhost           <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, T~
$ host_listings_count         <dbl> 1, 6, 2, 2, 1, 2, 1, 1, 1, 0, 7, 7, 1, 0, ~
$ host_verifications          <chr> "['email', 'phone', 'reviews', 'manual_off~
$ host_has_profile_pic        <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ~
$ host_identity_verified      <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ~
$ room_type                   <chr> "Private room", "Private room", "Entire ho~
$ accommodates                <dbl> 2, 2, 4, 5, 5, 7, 4, 3, 2, 8, 2, 3, 4, 2, ~
$ bathrooms_text              <chr> "1 shared bath", "1 shared bath", "1 bath"~
$ bedrooms                    <dbl> 1, 1, 1, 2, 2, 4, 1, 1, 1, 4, 1, 1, 2, NA,~
$ beds                        <dbl> 1, 1, 2, 2, 3, 4, 2, 0, 1, 4, 0, 2, 2, 1, ~
$ amenities                   <chr> "[\"Heating\", \"Hair dryer\", \"Iron\", \~
$ price                       <dbl> 39, 40, 123, 286, 140, 63, 75, 53, 150, 25~
$ minimum_nights              <dbl> 2, 1, 2, 3, 4, 3, 3, 1, 3, 3, 3, 3, 3, 1, ~
$ maximum_nights              <dbl> 7, 365, 60, 21, 1125, 365, 31, 365, 730, 1~
$ availability_30             <dbl> 0, 0, 4, 4, 2, 21, 2, 0, 0, 11, 0, 0, 23, ~
$ availability_60             <dbl> 0, 0, 34, 21, 21, 51, 5, 13, 0, 11, 0, 0, ~
$ number_of_reviews           <dbl> 245, 152, 331, 79, 4, 20, 152, 80, 0, 24, ~
$ number_of_reviews_ltm       <dbl> 0, 9, 3, 12, 0, 0, 31, 0, 0, 1, 6, 3, 0, 4~
$ number_of_reviews_l30d      <dbl> 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ~
$ review_scores_rating        <dbl> 4.63, 4.40, 4.69, 4.61, 4.67, 4.47, 4.93, ~
$ review_scores_accuracy      <dbl> 4.84, 4.52, 4.82, 4.84, 4.67, 4.60, 4.93, ~
$ review_scores_cleanliness   <dbl> 4.53, 4.30, 4.70, 4.63, 5.00, 4.10, 4.91, ~
$ review_scores_checkin       <dbl> 4.90, 4.62, 4.80, 4.81, 5.00, 4.95, 4.96, ~
$ review_scores_communication <dbl> 4.84, 4.74, 4.79, 4.75, 5.00, 4.80, 4.97, ~
$ review_scores_location      <dbl> 4.83, 4.71, 4.79, 4.85, 4.00, 4.40, 4.78, ~
$ review_scores_value         <dbl> 4.81, 4.64, 4.73, 4.66, 4.67, 4.55, 4.92, ~
$ instant_bookable            <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRU~
$ reviews_per_month           <dbl> 2.20, 1.43, 2.83, 0.69, 0.03, 0.51, 1.28, ~
$ Property_Type               <chr> "Other", "Private room in rental unit", "E~

1.2 Data cleaning & wrangling

A glimpse of newlistings1 reveals that we now have 7,400 observations of 32 variables, which is a much more manageable data set. However, we noticed a few price anomalies. Therefore, using the 5th and 95th percentiles of price (computed below), we removed the listings in the lower and upper 5% of prices and stored the result in a new data frame, newlistings2. Along with this we transformed some of the variables, for example converting host_verifications to a count of verification methods and converting the host acceptance rate from a percentage string to a proportion.

## Find the cut-offs used to trim anomalous prices: the 5th and 95th
## percentiles of nightly price, i.e. the bounds of the central 90% of
## listings. On this data the bounds are [30, 350].
quantile(newlistings1$price, probs = c(0.05, 0.95), na.rm = TRUE)
5%95%
30350
## Count the entries of a bracketed, comma-separated list stored as text,
## e.g. "['email', 'phone']" -> 2. An empty list ("[]") counts as 0 instead of
## being reported as one item; NA stays NA.
## NOTE(review): an entry whose own text contains a comma is counted more than
## once -- confirm whether that occurs in this data.
count_list_items <- function(x) {
  is_empty_list <- stringr::str_detect(x, "^\\s*\\[\\s*\\]\\s*$")
  dplyr::if_else(is_empty_list, 0, stringr::str_count(x, ",") + 1)
}

## Convert a percentage string such as "94%" to a proportion (0.94). The
## "N/A" placeholder is turned into NA explicitly, so no coercion warning is
## raised.
parse_percent <- function(x) {
  as.numeric(sub("%", "", dplyr::na_if(x, "N/A"), fixed = TRUE)) / 100
}

newlistings2 <- newlistings1 %>%

  ## Drop price outliers: keep prices inside the 5th-95th percentile range
  ## found for newlistings1, [30, 350] (both bounds inclusive)
  filter(price >= 30, price <= 350) %>%

  mutate(
    ## "host_verifications": number of verification methods listed
    host_verifications = count_list_items(host_verifications),

    ## "amenities": number of amenities listed
    amenities = count_list_items(amenities),

    ## "host_response_rate" / "host_acceptance_rate": "NN%" text -> proportion
    host_response_rate = parse_percent(host_response_rate),
    host_acceptance_rate = parse_percent(host_acceptance_rate),

    ## Flag listings whose bathroom description mentions "shared". This must
    ## run before bathrooms_text is overwritten with a number below.
    shared_bathroom = grepl("shared", bathrooms_text, fixed = TRUE),

    ## "bathrooms_text": keep only the leading number of the description
    ## (e.g. "1.5 baths" -> 1.5). Descriptions without a leading number
    ## become NA.
    ## NOTE(review): if purely textual values such as a "half-bath" wording
    ## occur, they end up as NA here -- confirm whether they should be 0.5.
    bathrooms_text = as.numeric(
      stringr::str_extract(bathrooms_text, "^[0-9]+(\\.[0-9]+)?")
    ),

    ## "price_4_nights": total cost of a 4-night stay
    price_4_nights = price * 4,

    ## "log_price_4_nights": log of the 4-night cost, to check whether the
    ## log scale suits regression better if the price is skewed
    log_price_4_nights = log(price_4_nights)
  )

## Separate copy that the later train/test split works on.
newlistings3 <- newlistings2

1.3 Prepare for visualization

Now we can start the EDA on this new data frame. Step 1: glimpsing the data.

# Inspect the cleaned table: row count, columns and their converted types.
glimpse(newlistings2)
Rows: 6,674
Columns: 35
$ listing_url                 <chr> "https://www.airbnb.com/rooms/2078", "http~
$ host_response_rate          <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, NA, 1.00, NA~
$ host_acceptance_rate        <dbl> 1.00, 1.00, 0.94, 0.94, 0.00, NA, 1.00, 0.~
$ host_is_superhost           <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, T~
$ host_listings_count         <dbl> 1, 6, 2, 2, 1, 2, 1, 1, 1, 0, 7, 7, 1, 0, ~
$ host_verifications          <dbl> 6, 7, 8, 8, 5, 5, 8, 5, 3, 4, 5, 5, 4, 6, ~
$ host_has_profile_pic        <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ~
$ host_identity_verified      <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ~
$ room_type                   <chr> "Private room", "Private room", "Entire ho~
$ accommodates                <dbl> 2, 2, 4, 5, 5, 7, 4, 3, 2, 8, 2, 3, 4, 2, ~
$ bathrooms_text              <dbl> 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, NA~
$ bedrooms                    <dbl> 1, 1, 1, 2, 2, 4, 1, 1, 1, 4, 1, 1, 2, NA,~
$ beds                        <dbl> 1, 1, 2, 2, 3, 4, 2, 0, 1, 4, 0, 2, 2, 1, ~
$ amenities                   <dbl> 14, 47, 32, 38, 23, 28, 52, 40, 2, 40, 22,~
$ price                       <dbl> 39, 40, 123, 286, 140, 63, 75, 53, 150, 25~
$ minimum_nights              <dbl> 2, 1, 2, 3, 4, 3, 3, 1, 3, 3, 3, 3, 3, 1, ~
$ maximum_nights              <dbl> 7, 365, 60, 21, 1125, 365, 31, 365, 730, 1~
$ availability_30             <dbl> 0, 0, 4, 4, 2, 21, 2, 0, 0, 11, 0, 0, 23, ~
$ availability_60             <dbl> 0, 0, 34, 21, 21, 51, 5, 13, 0, 11, 0, 0, ~
$ number_of_reviews           <dbl> 245, 152, 331, 79, 4, 20, 152, 80, 0, 24, ~
$ number_of_reviews_ltm       <dbl> 0, 9, 3, 12, 0, 0, 31, 0, 0, 1, 6, 3, 0, 4~
$ number_of_reviews_l30d      <dbl> 0, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ~
$ review_scores_rating        <dbl> 4.63, 4.40, 4.69, 4.61, 4.67, 4.47, 4.93, ~
$ review_scores_accuracy      <dbl> 4.84, 4.52, 4.82, 4.84, 4.67, 4.60, 4.93, ~
$ review_scores_cleanliness   <dbl> 4.53, 4.30, 4.70, 4.63, 5.00, 4.10, 4.91, ~
$ review_scores_checkin       <dbl> 4.90, 4.62, 4.80, 4.81, 5.00, 4.95, 4.96, ~
$ review_scores_communication <dbl> 4.84, 4.74, 4.79, 4.75, 5.00, 4.80, 4.97, ~
$ review_scores_location      <dbl> 4.83, 4.71, 4.79, 4.85, 4.00, 4.40, 4.78, ~
$ review_scores_value         <dbl> 4.81, 4.64, 4.73, 4.66, 4.67, 4.55, 4.92, ~
$ instant_bookable            <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRU~
$ reviews_per_month           <dbl> 2.20, 1.43, 2.83, 0.69, 0.03, 0.51, 1.28, ~
$ Property_Type               <chr> "Other", "Private room in rental unit", "E~
$ shared_bathroom             <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FAL~
$ price_4_nights              <dbl> 156, 160, 492, 1144, 560, 252, 300, 212, 6~
$ log_price_4_nights          <dbl> 5.049856, 5.075174, 6.198479, 7.042286, 6.~
# Summary statistics of nightly price after trimming: min, quartiles, max,
# mean, sd, n and number of missing values.
favstats(~price, data = newlistings2)
minQ1medianQ3maxmeansdnmissing
30629514035011064.766740

Here we see there are now 6,674 observations after removing the price anomalies, and 35 variables, as we have added shared_bathroom, price_4_nights and log_price_4_nights. Using favstats we can see that the mean has decreased slightly, while the SD has decreased roughly fourfold.

# Per-column summary grouped by type (character, logical, numeric), including
# missing-value counts and, for numeric columns, quantiles and a mini histogram.
skimr::skim(newlistings2)
Data summary
Name newlistings2
Number of rows 6674
Number of columns 35
_______________________
Column type frequency:
character 3
logical 5
numeric 27
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1 33 37 0 6674 0
room_type 0 1 10 15 0 4 0
Property_Type 0 1 5 27 0 5 0

Variable type: logical

skim_variable n_missing complete_rate mean count
host_is_superhost 2 1 0.19 FAL: 5435, TRU: 1237
host_has_profile_pic 2 1 0.99 TRU: 6636, FAL: 36
host_identity_verified 2 1 0.81 TRU: 5386, FAL: 1286
instant_bookable 0 1 0.44 FAL: 3722, TRU: 2952
shared_bathroom 0 1 0.17 FAL: 5569, TRU: 1105

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
host_response_rate 3129 0.53 0.89 0.24 0.00 0.93 1.00 1.00 1.00 ▁▁▁▁▇
host_acceptance_rate 2904 0.56 0.83 0.30 0.00 0.83 0.98 1.00 1.00 ▁▁▁▁▇
host_listings_count 2 1.00 8.32 21.93 0.00 1.00 1.00 4.00 341.00 ▇▁▁▁▁
host_verifications 0 1.00 5.14 2.24 1.00 3.00 6.00 7.00 12.00 ▆▅▇▃▁
accommodates 0 1.00 3.52 2.04 2.00 2.00 3.00 4.00 16.00 ▇▂▁▁▁
bathrooms_text 8 1.00 1.11 0.40 0.00 1.00 1.00 1.00 16.00 ▇▁▁▁▁
bedrooms 716 0.89 1.50 0.84 1.00 1.00 1.00 2.00 16.00 ▇▁▁▁▁
beds 68 0.99 1.75 1.30 0.00 1.00 1.00 2.00 50.00 ▇▁▁▁▁
amenities 0 1.00 24.34 11.52 1.00 15.00 23.00 32.00 71.00 ▅▇▅▁▁
price 0 1.00 110.08 64.69 30.00 62.00 95.00 140.00 350.00 ▇▅▂▁▁
minimum_nights 0 1.00 1.81 0.85 1.00 1.00 2.00 2.00 4.00 ▇▇▁▂▁
maximum_nights 0 1.00 677.06 511.79 1.00 32.00 1125.00 1125.00 3000.00 ▆▇▁▁▁
availability_30 0 1.00 5.45 8.63 0.00 0.00 0.00 9.00 30.00 ▇▁▁▁▁
availability_60 0 1.00 15.10 20.23 0.00 0.00 0.00 32.00 60.00 ▇▁▁▂▂
number_of_reviews 0 1.00 26.42 53.26 0.00 1.00 7.00 25.00 611.00 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 5.41 11.75 0.00 0.00 0.00 6.00 130.00 ▇▁▁▁▁
number_of_reviews_l30d 0 1.00 1.03 2.01 0.00 0.00 0.00 1.00 24.00 ▇▁▁▁▁
review_scores_rating 1105 0.83 4.55 0.84 0.00 4.50 4.78 4.99 5.00 ▁▁▁▁▇
review_scores_accuracy 1226 0.82 4.73 0.48 1.00 4.69 4.88 5.00 5.00 ▁▁▁▁▇
review_scores_cleanliness 1225 0.82 4.61 0.57 1.00 4.50 4.80 5.00 5.00 ▁▁▁▁▇
review_scores_checkin 1229 0.82 4.81 0.44 1.00 4.80 4.95 5.00 5.00 ▁▁▁▁▇
review_scores_communication 1226 0.82 4.80 0.45 1.00 4.79 4.95 5.00 5.00 ▁▁▁▁▇
review_scores_location 1230 0.82 4.78 0.38 1.00 4.71 4.90 5.00 5.00 ▁▁▁▁▇
review_scores_value 1230 0.82 4.66 0.50 1.00 4.57 4.79 4.97 5.00 ▁▁▁▁▇
reviews_per_month 1105 0.83 1.58 2.52 0.01 0.15 0.70 2.05 60.00 ▇▁▁▁▁
price_4_nights 0 1.00 440.34 258.74 120.00 248.00 380.00 560.00 1400.00 ▇▅▂▁▁
log_price_4_nights 0 1.00 5.93 0.57 4.79 5.51 5.94 6.33 7.24 ▃▆▇▅▂

Using this, we can see that there are 3 character variables, 5 logical variables and 27 numeric variables.

1.4 Plot1: Scatter Plot to Show Price against Amenities

# Plot 1: scatter of nightly price (x) against the amenities count (y), one
# point per listing, coloured by simplified property type.
newlistings2 %>%
  ggplot(mapping = aes(x = price, y = amenities, colour = Property_Type)) +
  geom_point() +
  labs(
    title = "Scatter Plot to Show Price against Amenities",
    x = "Price",
    y = "Amenities"
  ) +
  theme_bw()

The above plot shows the price of each Airbnb listing against the number of amenities it offers, coloured by property type. The graph shows little correlation between price and amenities. We believe this is because the amenities themselves are fairly inexpensive (hair dryers, heating, bed linen, etc.), so even low-priced accommodation can feasibly provide them. The plot also shows that the majority of the properties have a price below $200.

However, there is a relationship between price and private rooms. These are all clustered in the <$200 section with a few outliers; this could be due to a lack of desire to share a house with others, which drives the price of the property down.

1.5 Plot 2: Scatter Plot to Show Log Price against Amenities

# Plot 2: scatter of nightly price against the amenities count, coloured by
# simplified property type, with price drawn on a log10 x-axis (tick labels
# stay in original price units).
newlistings2 %>%
  ggplot(mapping = aes(x = price, y = amenities, colour = Property_Type)) +
  geom_point() +
  scale_x_log10(labels = scales::comma) +
  labs(
    title = "Scatter Plot to Show Log Price against Amenities",
    x = "Price",
    y = "Amenities"
  ) +
  theme_bw()

This plot is the same as the one above, except that price is shown on a log scale, which produces a more normal distribution. Here it is much clearer that the private rooms are the most affordable listings. We will therefore continue to use log price from now on.

1.6 Plot 3: Histogram to Show Log Price

# Plot 3a: density-scaled histogram of nightly price on the original scale,
# with a red vertical line marking the mean price.
ggplot(newlistings2, aes(x = price)) +
  # alpha is a fixed setting, so it is passed outside aes(); inside aes() it
  # is treated as a mapped variable and adds a spurious legend.
  geom_histogram(
    aes(y = ..density..),
    colour = "black",
    fill = "blue",
    alpha = 0.6
  ) +
  # One constant intercept: mapping mean(price) inside aes() would draw one
  # overlapping line per row of the data.
  geom_vline(
    xintercept = mean(newlistings2$price),
    col = "red",
    size = 2,
    alpha = 0.6
  ) +
  labs(
    x = "Price",
    y = "Frequency Density"
  ) +
  ggtitle("Histogram to Show Price") +
  theme_bw()

# Plot 3b: density-scaled histogram of nightly price with a log10 x-axis
# (tick labels stay in original price units) and a red vertical line at the
# mean price.
ggplot(newlistings2, aes(x = price)) +
  # alpha is a fixed setting, so it is passed outside aes(); inside aes() it
  # is treated as a mapped variable and adds a spurious legend.
  geom_histogram(
    aes(y = ..density..),
    colour = "black",
    fill = "blue",
    alpha = 0.6
  ) +
  scale_x_log10(labels = scales::comma) +
  # One constant intercept: mapping mean(price) inside aes() would draw one
  # overlapping line per row of the data.
  geom_vline(
    xintercept = mean(newlistings2$price),
    col = "red",
    size = 2,
    alpha = 0.6
  ) +
  labs(
    x = "Log_Price",
    y = "Frequency Density"
  ) +
  ggtitle("Histogram to Show Log Price") +
  theme_bw()

The above plots show histograms of the price of the properties, one without and one with a log scale. It is clear from the first graph that the distribution is positively skewed, and using the log scale makes it closer to normal, which is better for regression models. In both plots the red line marks the mean price, which is around $110.

1.7 Plot 4: Density Plot to Show Review Rating

# Plot 4: density of the overall review rating (review_scores_rating).
ggplot(newlistings2, aes(x = review_scores_rating)) +
  # fill and alpha are fixed settings, so they go outside aes(); inside aes()
  # the string "pink" is treated as a data value, which produces the default
  # palette colour and a legend instead of a pink fill.
  geom_density(fill = "pink", alpha = 0.5) +
  labs(
    x = "Review Rating",
    y = "Frequency Density"
  ) +
  ggtitle("Density Plot to Show Review Rating") +
  # theme_bw() is a complete theme and resets earlier theme() tweaks, so the
  # legend setting has to come after it.
  theme_bw() +
  theme(legend.position = "none")

This plot shows a strong negative skew towards the 5-star rating, i.e. the majority of ratings are close to 5. As the ratings are so high, before doing any analysis I could hypothesise that reviews do not have much bearing on price, because the majority of reviews are 5 star but not all the prices are high.

1.8 Plot 5: Boxplot to Show Super Hosts to Price

# Plot 5: boxplots of nightly price (log10 axis) split by superhost status.

# Take the two columns needed, by name. newlistings2 holds the same data that
# newlistings3 was copied from, and it is not rebound later by the train/test
# split, so this chunk can be re-run at any point.
superhost <- newlistings2[c("host_is_superhost", "price")]

# Drop rows with a missing value in either column (superhost status has NAs).
superhost1 <- na.omit(superhost)

ggplot(
  superhost1,
  aes(x = price, y = host_is_superhost, fill = host_is_superhost)
) +
  geom_boxplot() +
  scale_x_log10(labels = scales::comma) +
  labs(
    x = "Price",
    y = "Super Host"
  ) +
  ggtitle("Boxplot to Show Super Hosts to Price") +
  # The fill legend repeats the y-axis, so hide it. This must follow
  # theme_bw(), which is a complete theme and resets earlier theme() tweaks.
  theme_bw() +
  theme(legend.position = "none")

The boxplot shows superhost status against log price. Clearly the median price for superhosts is greater than that for hosts who are not superhosts. I would suggest this is because these hosts are seen to be superior and can therefore charge more, especially if this correlates with having more amenities, better response times, etc. Surprisingly, however, the most expensive property belongs to a host who is not a superhost, but the superhosts have a greater minimum price because they know they can charge more for the property. The IQR and range are smaller for superhosts; this could be because they know how much they can charge, compared to normal hosts who may have less experience with Airbnb.

1.9 Plot 6: Correlation between each variables

# Plot 6: pairwise relationships between a subset of variables.
# The original author's note says these are the variables the full linear
# model found most statistically significant. They are selected by name, so
# the plot does not change silently if the column order of newlistings2
# changes.
# NOTE(review): the accompanying narrative discusses bedrooms, review scores
# and availability, none of which are in this set -- confirm that these are
# the intended columns.
pair_vars <- c(
  "host_identity_verified",
  "room_type",
  "accommodates",
  "beds",
  "reviews_per_month",
  "Property_Type",
  "shared_bathroom",
  "log_price_4_nights"
)
scatterplot <- newlistings2[pair_vars]

# Run ggpairs to check the relationships between the selected variables.
GGally::ggpairs(scatterplot)

Using ggpairs I can see the correlations between variables. The greatest correlation is between the number of bedrooms and the number of people the property sleeps. This is expected, as the more rooms there are, the more people the property can sleep. The second-highest correlation is between review value and review location; this must be because the nicer the location, the better the review. Availability and reviews are slightly negatively correlated. This suggests that the more availability a property has, the worse its reviews, which makes sense: if the property is free more often, this could be because it isn't the best property, which would bring worse reviews with it.

Accommodates, bathrooms, bedrooms, price, availability all have a positive skew, whilst reviews have a negative skew.

1.10 Mapping

Visualisations of feature distributions and their relations are key to understanding a data set, and they can open up new lines of exploration. While we do not have time to go into all the wonderful geospatial visualisations one can do with R, you can use the following code to start with a map of your city, and overlay all AirBnB coordinates to get an overview of the spatial distribution of AirBnB rentals. For this visualisation we use the leaflet package, which includes a variety of tools for interactive maps, so you can easily zoom in-out, click on a point to get the actual AirBnB listing for that specific point, etc.

The following code, having downloaded a dataframe listings with all Airbnb listings in Montreal, will plot on the map all Airbnbs where minimum_nights is less than or equal to four (4). You can learn more about leaflet by following the relevant Datacamp course on mapping with leaflet.

# Interactive map with one small circle per listing whose minimum stay is at
# most four nights. Hovering shows the property type; clicking shows the
# listing URL.
listings %>%
  filter(minimum_nights <= 4) %>%
  leaflet() %>%
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )

2 Regression Analysis

2.1 Split data into training and testing sets

library(rsample)

# Fix the random seed so the random partition below is reproducible.
set.seed(1234)

# Randomly assign 70% of the rows to the training set (used to fit the
# regression models) and hold out the remaining 30% for testing the final
# model.
# Note: from here on `newlistings3` is the split object, not a data frame.
newlistings3 <- newlistings3 %>%
  initial_split(prop = 0.70)

airbnb_test <- testing(newlistings3)
airbnb_train <- training(newlistings3)

2.2 Skewness of “price_4_nights”

# Distribution of "price_4_nights" in airbnb_train.
# The title names airbnb_train because that is the data actually plotted.
ggplot(airbnb_train, aes(x = price_4_nights)) +
  geom_density() +
  labs(
    x = "Price",
    y = "Density"
  ) +
  ggtitle("Distribution of price for 4 nights in airbnb_train") +
  theme_bw()

# Distribution of "log_price_4_nights" in airbnb_train.
# The title names airbnb_train because that is the data actually plotted, and
# the x-axis is labelled as a log because it shows log values, not prices.
ggplot(airbnb_train, aes(x = log_price_4_nights)) +
  geom_density() +
  labs(
    x = "Log price",
    y = "Density"
  ) +
  ggtitle(
    "Distribution of price for 4 nights calculated in log() in airbnb_train"
  ) +
  theme_bw()

We can see from the above graphs that the distribution of price_4_nights is clearly right-skewed. Hence, to make the distribution closer to normal, we decided to log-transform the dependent variable to log_price_4_nights in our further model building.

2.3 Model 1

# Model 1: log of the 4-night price regressed on three explanatory variables:
# simplified property type, number of reviews and overall review rating.
# Fitted on the training set.
model1 <- lm(
  log_price_4_nights ~ Property_Type + number_of_reviews + review_scores_rating,
  data = airbnb_train
)

# Print the coefficient table and fit statistics.
mosaic::msummary(model1)
                                           Estimate Std. Error t value Pr(>|t|)
(Intercept)                               6.0900749  0.0502419 121.215  < 2e-16
Property_TypeEntire loft                  0.0120858  0.0434308   0.278    0.781
Property_TypeEntire rental unit          -0.1543305  0.0279965  -5.512 3.77e-08
Property_TypeOther                       -0.1846103  0.0337508  -5.470 4.79e-08
Property_TypePrivate room in rental unit -0.8393568  0.0330639 -25.386  < 2e-16
number_of_reviews                         0.0005848  0.0001412   4.140 3.54e-05
review_scores_rating                      0.0119647  0.0093088   1.285    0.199
                                            
(Intercept)                              ***
Property_TypeEntire loft                    
Property_TypeEntire rental unit          ***
Property_TypeOther                       ***
Property_TypePrivate room in rental unit ***
number_of_reviews                        ***
review_scores_rating                        

Residual standard error: 0.4987 on 3915 degrees of freedom
  (749 observations deleted due to missingness)
Multiple R-squared:  0.222, Adjusted R-squared:  0.2208 
F-statistic: 186.2 on 6 and 3915 DF,  p-value: < 2.2e-16
# Plot model1 with autoplot().
# NOTE(review): presumably this uses an lm method that draws residual
# diagnostic plots (e.g. from ggfortify) -- confirm which package supplies it.
autoplot(model1)

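$$
\begin{aligned}
\log(\text{price\_4\_nights}) &= \beta_0 + \beta_1 \times \text{Entire\_Loft} + \beta_2 \times \text{Entire\_Rental\_Unit} + \beta_3 \times \text{Private\_Room} \\
&\quad + \beta_4 \times \text{Other\_Property\_Type} + \beta_5 \times \text{Number\_of\_Reviews} + \beta_6 \times \text{Review\_Score}
\end{aligned}
$$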

1.1 Property Type: In our regression result, the intercept gives $\exp(\beta_0) = \exp(6.0901) \approx 441.47$, which indicates that the predicted price_4_nights is about 441 for Entire_Condo (the baseline property type) when Number_of_Reviews and Review_Score equal zero. Although this intercept is statistically significant at the 95% confidence level, it is not very meaningful due to the conditions attached.

$\exp(\beta_1) - 1 = \exp(0.0121) - 1 \approx 0.0122$. This means price_4_nights will be 1.22% higher for the property type Entire_Loft than for the baseline property type (Entire_Condo), holding all else equal. However, the p-value of 0.781 suggests that this coefficient estimate is not statistically significant.

$1 - \exp(\beta_2) = 1 - \exp(-0.1543) \approx 0.143$. We expect that price_4_nights will be 14.3% lower for Entire_Rental_Unit than for the baseline property type (Entire_Condo), holding all else equal. This coefficient estimate is statistically significant at the 95% confidence level, as the p-value falls below 0.05.

$1 - \exp(\beta_3) = 1 - \exp(-0.8394) \approx 0.568$. We expect that price_4_nights will be 56.8% lower for Private_Room than for the baseline property type (Entire_Condo), holding all else equal. This coefficient estimate is statistically significant at the 95% confidence level, as the p-value falls below 0.05.

$1 - \exp(\beta_4) = 1 - \exp(-0.1846) \approx 0.1686$. We expect that price_4_nights will be 16.86% lower for Other_Property_Type than for the baseline property type (Entire_Condo), holding all else equal. This coefficient estimate is statistically significant at the 95% confidence level, as the p-value falls below 0.05.

1.2 Number of Reviews: $\exp(\beta_5) - 1 = \exp(0.0006) - 1 \approx 0.0006$. This means that for every unit increase in Number_of_Reviews, price_4_nights will only increase by about 0.06% (roughly 6 basis points), holding all else equal. The p-value of the coefficient estimate is lower than 0.05, therefore we can conclude that this is statistically significant at the 95% confidence level. However, this estimate is not practically significant, since an increase of a few basis points in price is very small for a one-unit increase in the number of reviews, which is generally not a large number.

1.3 Review Score Rating: $\exp(\beta_6) - 1 = \exp(0.012) - 1 \approx 0.012$. This means that for every unit increase in Review_Score, price_4_nights will increase by 1.2%, holding all else equal. However, looking at the p-value of the coefficient estimate, we can conclude that this is not statistically significant at the 95% confidence level.

1.4 Overall Model: From the F-statistic we know the overall model is statistically significant at the 95% confidence level, and we conclude that model 1 fits the sample data better than a model without independent variables. In addition, the adjusted R-squared of 0.22 means the model explains only about 22% of the variation in log(price_4_nights), which is considered very weak.

2.4 Model 2

# Model 2: Model 1's three predictors (simplified property type, number of
# reviews, overall review rating) plus room type, giving four explanatory
# variables. Fitted on the training set.
model2 <- lm(
  log_price_4_nights ~
    Property_Type + number_of_reviews + review_scores_rating + room_type,
  data = airbnb_train
)

# Print the coefficient table and fit statistics.
mosaic::msummary(model2)
                                           Estimate Std. Error t value Pr(>|t|)
(Intercept)                               6.1060189  0.0484742 125.964  < 2e-16
Property_TypeEntire loft                  0.0116261  0.0418861   0.278  0.78136
Property_TypeEntire rental unit          -0.1547214  0.0270008  -5.730 1.08e-08
Property_TypeOther                        0.1807273  0.0420752   4.295 1.79e-05
Property_TypePrivate room in rental unit -0.1689897  0.0543465  -3.109  0.00189
number_of_reviews                         0.0006017  0.0001362   4.416 1.03e-05
review_scores_rating                      0.0084180  0.0089828   0.937  0.34875
room_typeHotel room                       0.2835691  0.0941446   3.012  0.00261
room_typePrivate room                    -0.6707124  0.0440211 -15.236  < 2e-16
room_typeShared room                     -0.8306064  0.1639074  -5.068 4.22e-07
                                            
(Intercept)                              ***
Property_TypeEntire loft                    
Property_TypeEntire rental unit          ***
Property_TypeOther                       ***
Property_TypePrivate room in rental unit ** 
number_of_reviews                        ***
review_scores_rating                        
room_typeHotel room                      ** 
room_typePrivate room                    ***
room_typeShared room                     ***

Residual standard error: 0.481 on 3912 degrees of freedom
  (749 observations deleted due to missingness)
Multiple R-squared:  0.2769,    Adjusted R-squared:  0.2753 
F-statistic: 166.5 on 9 and 3912 DF,  p-value: < 2.2e-16
# Plot model2 via the autoplot method for lm fits
# (presumably ggfortify's regression diagnostics - confirm in the setup chunk).
autoplot(model2)

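$$
\begin{aligned}
\log(\text{price\_4\_nights}) &= \beta_0 + \beta_1 \times \text{Entire\_Loft} + \beta_2 \times \text{Entire\_Rental\_Unit} + \beta_3 \times \text{Private\_Room} \\
&+ \beta_4 \times \text{Other\_Property\_Type} + \beta_5 \times \text{Number\_of\_Reviews} + \beta_6 \times \text{Review\_Score} \\
&+ \beta_7 \times \text{Hotel} + \beta_8 \times \text{Private} + \beta_9 \times \text{Shared}
\end{aligned}
$$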

2.1 Room Type: The intercept is less interesting here, as it requires Number_of_Reviews and Review_Score to equal zero, which is not useful in reality.

$\exp(\beta_7)-1 = \exp(0.2836)-1 \approx 0.3279$. This means price_4_nights will be 32.79% higher for room type of Hotel than for room type that is not Hotel, holding all else equal. This coefficient estimate is statistically significant at 95% confidence level, as p_value falls below 0.05.

$1-\exp(\beta_8) = 1-\exp(-0.6707) \approx 0.4886$. We expect that price_4_nights will be 48.86% lower for room type of Private than for room type that is not Private, holding all else equal. This coefficient estimate is statistically significant at 95% confidence level, as p_value falls below 0.05.

$1-\exp(\beta_9) = 1-\exp(-0.8306) \approx 0.5642$. We expect that price_4_nights will be 56.42% lower for room type of Shared than for room type that is not Shared, holding all else equal. This coefficient estimate is statistically significant at 95% confidence level, as p_value falls below 0.05.

2.2 Overall Model: From the F-stat we know the overall model is statistically significant at 95% confidence level and we conclude that model 2 fits the sample data better than the model without independent variables. In addition, the adjusted R-squared of 0.28 suggests that the model explains only about 28% of the variation in log(price_4_nights), which is considered very weak, similar to the previous model. All three room types seem to be both statistically significant and practically significant, and we decide to incorporate these factors into our further model building.

2.5 Model 3&4

# Drop three columns before fitting on "everything else": price_4_nights is
# the untransformed response, and price and listing_url are removed with it
# (presumably because price feeds into the response and the URL is only an
# identifier - confirm against the data-preparation chunk).
new_airbnb_train <- airbnb_train %>%
  select(-c(price_4_nights, price, listing_url))

# Model 3: regress the log 4-night price on every remaining column.
model3 <- lm(formula = log_price_4_nights ~ ., data = new_airbnb_train)
mosaic::msummary(model3)
                                           Estimate Std. Error t value Pr(>|t|)
(Intercept)                               4.625e+00  1.883e-01  24.565  < 2e-16
host_response_rate                        9.582e-03  5.386e-02   0.178 0.858823
host_acceptance_rate                      8.312e-03  4.369e-02   0.190 0.849148
host_is_superhostTRUE                     3.825e-02  2.142e-02   1.786 0.074326
host_listings_count                       2.212e-04  5.601e-04   0.395 0.692964
host_verifications                        4.462e-03  4.605e-03   0.969 0.332688
host_has_profile_picTRUE                  4.632e-02  1.149e-01   0.403 0.686834
host_identity_verifiedTRUE                1.018e-01  3.307e-02   3.079 0.002112
room_typeHotel room                       2.837e-01  9.589e-02   2.959 0.003127
room_typePrivate room                    -2.504e-01  5.324e-02  -4.702 2.77e-06
room_typeShared room                     -3.757e-01  1.658e-01  -2.265 0.023610
accommodates                              3.832e-02  7.120e-03   5.382 8.36e-08
bathrooms_text                            1.194e-01  2.581e-02   4.625 4.01e-06
bedrooms                                  1.052e-01  1.625e-02   6.474 1.23e-10
beds                                     -9.800e-03  1.327e-02  -0.738 0.460312
amenities                                 3.370e-03  8.153e-04   4.133 3.74e-05
minimum_nights                            2.468e-02  1.187e-02   2.079 0.037762
maximum_nights                            9.363e-06  1.808e-05   0.518 0.604568
availability_30                           1.601e-02  2.321e-03   6.895 7.45e-12
availability_60                           6.126e-04  1.038e-03   0.590 0.555200
number_of_reviews                         1.111e-05  1.573e-04   0.071 0.943665
number_of_reviews_ltm                    -1.500e-03  8.998e-04  -1.667 0.095656
number_of_reviews_l30d                    2.715e-03  5.031e-03   0.540 0.589589
review_scores_rating                      1.905e-01  5.527e-02   3.446 0.000582
review_scores_accuracy                   -1.153e-02  4.809e-02  -0.240 0.810509
review_scores_cleanliness                 1.005e-01  3.251e-02   3.091 0.002023
review_scores_checkin                    -2.935e-02  4.060e-02  -0.723 0.469795
review_scores_communication              -7.399e-02  3.904e-02  -1.895 0.058193
review_scores_location                    2.321e-01  3.356e-02   6.917 6.42e-12
review_scores_value                      -2.538e-01  4.959e-02  -5.118 3.43e-07
instant_bookableTRUE                     -4.917e-02  2.082e-02  -2.362 0.018302
reviews_per_month                         1.724e-03  3.404e-03   0.506 0.612651
Property_TypeEntire loft                  1.476e-01  4.735e-02   3.118 0.001851
Property_TypeEntire rental unit          -8.139e-02  2.879e-02  -2.827 0.004745
Property_TypeOther                       -1.390e-02  4.502e-02  -0.309 0.757513
Property_TypePrivate room in rental unit -1.071e-01  6.105e-02  -1.754 0.079519
shared_bathroomTRUE                      -3.740e-01  4.405e-02  -8.492  < 2e-16
                                            
(Intercept)                              ***
host_response_rate                          
host_acceptance_rate                        
host_is_superhostTRUE                    .  
host_listings_count                         
host_verifications                          
host_has_profile_picTRUE                    
host_identity_verifiedTRUE               ** 
room_typeHotel room                      ** 
room_typePrivate room                    ***
room_typeShared room                     *  
accommodates                             ***
bathrooms_text                           ***
bedrooms                                 ***
beds                                        
amenities                                ***
minimum_nights                           *  
maximum_nights                              
availability_30                          ***
availability_60                             
number_of_reviews                           
number_of_reviews_ltm                    .  
number_of_reviews_l30d                      
review_scores_rating                     ***
review_scores_accuracy                      
review_scores_cleanliness                ** 
review_scores_checkin                       
review_scores_communication              .  
review_scores_location                   ***
review_scores_value                      ***
instant_bookableTRUE                     *  
reviews_per_month                           
Property_TypeEntire loft                 ** 
Property_TypeEntire rental unit          ** 
Property_TypeOther                          
Property_TypePrivate room in rental unit .  
shared_bathroomTRUE                      ***

Residual standard error: 0.3703 on 1770 degrees of freedom
  (2864 observations deleted due to missingness)
Multiple R-squared:  0.5379,    Adjusted R-squared:  0.5285 
F-statistic: 57.22 on 36 and 1770 DF,  p-value: < 2.2e-16
# Check collinearity: car::vif() reports a (generalised) variance inflation
# factor per model term; terms with Df > 1 (room_type, Property_Type) are
# also given as GVIF^(1/(2*Df)).
car::vif(model3)
                                GVIF Df GVIF^(1/(2*Df))
host_response_rate          1.804948  1        1.343484
host_acceptance_rate        2.112224  1        1.453349
host_is_superhost           1.236345  1        1.111910
host_listings_count         1.268525  1        1.126288
host_verifications          1.339502  1        1.157369
host_has_profile_pic        1.051864  1        1.025604
host_identity_verified      1.260535  1        1.122735
room_type                   7.816388  3        1.408751
accommodates                3.694081  1        1.921999
bathrooms_text              1.341229  1        1.158115
bedrooms                    2.654212  1        1.629175
beds                        4.006709  1        2.001677
amenities                   1.202004  1        1.096360
minimum_nights              1.237510  1        1.112434
maximum_nights              1.065481  1        1.032222
availability_30             5.699080  1        2.387275
availability_60             5.696394  1        2.386712
number_of_reviews           1.439811  1        1.199921
number_of_reviews_ltm       1.993903  1        1.412056
number_of_reviews_l30d      1.929893  1        1.389206
review_scores_rating        7.826038  1        2.797506
review_scores_accuracy      5.143473  1        2.267923
review_scores_cleanliness   3.372706  1        1.836493
review_scores_checkin       3.041157  1        1.743891
review_scores_communication 2.874017  1        1.695293
review_scores_location      1.766853  1        1.329230
review_scores_value         5.756587  1        2.399289
instant_bookable            1.420687  1        1.191926
reviews_per_month           1.404498  1        1.185115
Property_Type               6.467199  4        1.262814
shared_bathroom             3.209166  1        1.791415
# Plot model3 via the autoplot method for lm fits
# (presumably ggfortify's regression diagnostics - confirm in the setup chunk).
autoplot(model3)

After running a VIF check, we are able to determine which variables have a (G)VIF higher than 5, meaning that we found variables that are strongly correlated with each other. We will now remove the most strongly correlated variables: room_type, availability_60, review_scores_accuracy and review_scores_rating. We also drop number_of_reviews_ltm and number_of_reviews_l30d, while review_scores_value is kept in the model.

2.6 Model 4

# Training set for Model 4: the Model 3 data minus six predictors
# (room_type, availability_60, two review scores and two review counts).
model_4_training <- new_airbnb_train %>%
  select(
    -c(
      room_type,
      availability_60,
      review_scores_accuracy,
      review_scores_rating,
      number_of_reviews_l30d,
      number_of_reviews_ltm
    )
  )

# Model 4: regress the log 4-night price on every column left in
# model_4_training.
model4 <- lm(formula = log_price_4_nights ~ ., data = model_4_training)
mosaic::msummary(model4)
                                           Estimate Std. Error t value Pr(>|t|)
(Intercept)                               4.525e+00  1.894e-01  23.895  < 2e-16
host_response_rate                        6.356e-03  5.428e-02   0.117  0.90680
host_acceptance_rate                      9.345e-03  4.382e-02   0.213  0.83114
host_is_superhostTRUE                     4.816e-02  2.142e-02   2.248  0.02467
host_listings_count                       5.215e-04  5.629e-04   0.926  0.35437
host_verifications                        5.614e-03  4.635e-03   1.211  0.22596
host_has_profile_picTRUE                  2.890e-02  1.160e-01   0.249  0.80331
host_identity_verifiedTRUE                9.997e-02  3.350e-02   2.984  0.00288
accommodates                              4.180e-02  7.151e-03   5.845 6.00e-09
bathrooms_text                            1.222e-01  2.609e-02   4.683 3.04e-06
bedrooms                                  1.180e-01  1.629e-02   7.248 6.30e-13
beds                                     -1.576e-02  1.321e-02  -1.193  0.23313
amenities                                 3.600e-03  8.182e-04   4.400 1.14e-05
minimum_nights                            2.954e-02  1.169e-02   2.526  0.01161
maximum_nights                            6.570e-06  1.813e-05   0.362  0.71714
availability_30                           1.740e-02  1.079e-03  16.125  < 2e-16
number_of_reviews                        -1.422e-04  1.465e-04  -0.971  0.33168
review_scores_cleanliness                 1.581e-01  2.848e-02   5.552 3.24e-08
review_scores_checkin                    -1.049e-02  4.053e-02  -0.259  0.79575
review_scores_communication              -3.262e-02  3.662e-02  -0.891  0.37317
review_scores_location                    2.364e-01  3.370e-02   7.015 3.26e-12
review_scores_value                      -1.805e-01  4.153e-02  -4.346 1.47e-05
instant_bookableTRUE                     -6.339e-02  2.097e-02  -3.022  0.00254
reviews_per_month                         3.668e-04  3.154e-03   0.116  0.90743
Property_TypeEntire loft                  1.546e-01  4.786e-02   3.231  0.00126
Property_TypeEntire rental unit          -9.146e-02  2.903e-02  -3.151  0.00165
Property_TypeOther                       -1.074e-01  3.629e-02  -2.960  0.00312
Property_TypePrivate room in rental unit -2.538e-01  4.982e-02  -5.095 3.86e-07
shared_bathroomTRUE                      -4.897e-01  3.931e-02 -12.458  < 2e-16
                                            
(Intercept)                              ***
host_response_rate                          
host_acceptance_rate                        
host_is_superhostTRUE                    *  
host_listings_count                         
host_verifications                          
host_has_profile_picTRUE                    
host_identity_verifiedTRUE               ** 
accommodates                             ***
bathrooms_text                           ***
bedrooms                                 ***
beds                                        
amenities                                ***
minimum_nights                           *  
maximum_nights                              
availability_30                          ***
number_of_reviews                           
review_scores_cleanliness                ***
review_scores_checkin                       
review_scores_communication                 
review_scores_location                   ***
review_scores_value                      ***
instant_bookableTRUE                     ** 
reviews_per_month                           
Property_TypeEntire loft                 ** 
Property_TypeEntire rental unit          ** 
Property_TypeOther                       ** 
Property_TypePrivate room in rental unit ***
shared_bathroomTRUE                      ***

Residual standard error: 0.3757 on 1778 degrees of freedom
  (2864 observations deleted due to missingness)
Multiple R-squared:  0.5223,    Adjusted R-squared:  0.5148 
F-statistic: 69.44 on 28 and 1778 DF,  p-value: < 2.2e-16
# Check collinearity again after dropping variables: car::vif() reports a
# (generalised) variance inflation factor for each term of model4.
car::vif(model4)
                                GVIF Df GVIF^(1/(2*Df))
host_response_rate          1.781461  1        1.334714
host_acceptance_rate        2.064661  1        1.436893
host_is_superhost           1.201231  1        1.096007
host_listings_count         1.245382  1        1.115967
host_verifications          1.318957  1        1.148459
host_has_profile_pic        1.043091  1        1.021318
host_identity_verified      1.257192  1        1.121246
accommodates                3.620839  1        1.902850
bathrooms_text              1.332044  1        1.154142
bedrooms                    2.591988  1        1.609965
beds                        3.858717  1        1.964362
amenities                   1.176491  1        1.084662
minimum_nights              1.166882  1        1.080223
maximum_nights              1.041640  1        1.020608
availability_30             1.197091  1        1.094117
number_of_reviews           1.213590  1        1.101631
review_scores_cleanliness   2.513919  1        1.585534
review_scores_checkin       2.944368  1        1.715916
review_scores_communication 2.458054  1        1.567818
review_scores_location      1.731947  1        1.316034
review_scores_value         3.922340  1        1.980490
instant_bookable            1.401035  1        1.183653
reviews_per_month           1.171568  1        1.082390
Property_Type               2.834018  4        1.139070
shared_bathroom             2.483908  1        1.576042
# Plot model4 via the autoplot method for lm fits
# (presumably ggfortify's regression diagnostics - confirm in the setup chunk).
autoplot(model4)

In Model 3, we have identified multiple variables that we considered could have a significant impact on the price. We ran a multiple regression with 31 predictors (36 estimated coefficients once the categorical variables are expanded into dummy variables) and log-transformed the dependent variable - price for four nights.

At a 95% confidence level, only 18 variables are statistically significant and the rest have a p-value that is greater than 0.05. Therefore, even though certain variables such as Property_TypePrivate room in rental unit had especially high coefficients, we had to conclude that they were statistically insignificant.

Out of the 18 statistically significant variables, 8 of them are dummy variables where a 0 or 1 has been assigned for a false or true value respectively. This applies to binary scenarios such as whether or not a host’s identity has been verified or if the bathroom is shared or not. For these variables, the percentage change in price is generally quite large at 20+ percent. Because these variables are binary, the true or false values are diametrically opposed and this translates into a huge qualitative difference for consumers. For example, having a shared room vs a private room, or a shared bathroom vs a private bathroom, will make a large difference to what consumers are willing to pay for accommodation.

Conversely, for variables with a spectrum of possible values (such as amenities and accommodates) the percentage change in price is small (0.34% and 3.91% respectively). This is not to say that these factors are not as important to consumers as the binary ones listed above. Because these variables have a relatively large range, a change in accommodates from 1 to 10 will result in a price change of a similar magnitude to one of the binary variables.

In addition to these variables, there were “hybrid” variables whose values were restricted. For example, minimum_nights only ranges from 1 to 4 and the various review_score variables only range from 1 to 5. Having explained the general types of variables in our dataset, there are certain particularly interesting variables. room_typeHotel room was the variable that had the biggest impact on price.

$\exp(\beta_r)-1 = \exp(0.2837)-1 \approx 0.328$. This indicates that price_4_nights will be 32.8% higher for listings that are hotel rooms versus those that are not. This may be the case because hotel rooms come with various amenities that allow the hotels to charge a higher price relative to an equivalent accommodation in a private apartment or house. Property_TypePrivate room in rental unit was also an interesting variable. While it had a higher coefficient than most other variables, its p_value was above 0.05 and it was therefore statistically insignificant.

The interpretation of model 4 is roughly the same except that certain variables have been removed. These variables were ones that were strongly correlated with one another and were therefore influencing each other and not just the price. If these variables were not removed, the change on price would be compounded through certain variables. Removing these variables gives a truer picture of the variables that affect the price.

2.7 Model 5

# Model 5: refit using only the predictors that were significant at the 5%
# level in Model 4:
#   host_is_superhost, host_identity_verified, accommodates, bathrooms_text,
#   bedrooms, amenities, minimum_nights, availability_30,
#   review_scores_cleanliness, review_scores_location, instant_bookable,
#   shared_bathroom, Property_Type, review_scores_value
model5 <- lm(
  log_price_4_nights ~
    host_is_superhost +
    host_identity_verified +
    accommodates +
    bathrooms_text +
    bedrooms +
    amenities +
    minimum_nights +
    availability_30 +
    review_scores_cleanliness +
    review_scores_location +
    instant_bookable +
    shared_bathroom +
    Property_Type +
    review_scores_value,
  data = model_4_training
)

mosaic::msummary(model5)
                                           Estimate Std. Error t value Pr(>|t|)
(Intercept)                               4.7737917  0.1039649  45.917  < 2e-16
host_is_superhostTRUE                     0.0484595  0.0173071   2.800 0.005140
host_identity_verifiedTRUE                0.0701380  0.0181880   3.856 0.000117
accommodates                              0.0344684  0.0048837   7.058 2.04e-12
bathrooms_text                            0.1327165  0.0217055   6.114 1.08e-09
bedrooms                                  0.1350339  0.0126579  10.668  < 2e-16
amenities                                 0.0041056  0.0006377   6.438 1.38e-10
minimum_nights                            0.0256327  0.0083888   3.056 0.002264
availability_30                           0.0181162  0.0009055  20.006  < 2e-16
review_scores_cleanliness                 0.1602946  0.0182738   8.772  < 2e-16
review_scores_location                    0.2205028  0.0242824   9.081  < 2e-16
instant_bookableTRUE                     -0.0545918  0.0139822  -3.904 9.63e-05
shared_bathroomTRUE                      -0.3737279  0.0287213 -13.012  < 2e-16
Property_TypeEntire loft                  0.0957053  0.0406231   2.356 0.018533
Property_TypeEntire rental unit          -0.1243023  0.0232481  -5.347 9.55e-08
Property_TypeOther                       -0.1554221  0.0300605  -5.170 2.47e-07
Property_TypePrivate room in rental unit -0.3719136  0.0357592 -10.400  < 2e-16
review_scores_value                      -0.2547412  0.0247732 -10.283  < 2e-16
                                            
(Intercept)                              ***
host_is_superhostTRUE                    ** 
host_identity_verifiedTRUE               ***
accommodates                             ***
bathrooms_text                           ***
bedrooms                                 ***
amenities                                ***
minimum_nights                           ** 
availability_30                          ***
review_scores_cleanliness                ***
review_scores_location                   ***
instant_bookableTRUE                     ***
shared_bathroomTRUE                      ***
Property_TypeEntire loft                 *  
Property_TypeEntire rental unit          ***
Property_TypeOther                       ***
Property_TypePrivate room in rental unit ***
review_scores_value                      ***

Residual standard error: 0.3932 on 3388 degrees of freedom
  (1265 observations deleted due to missingness)
Multiple R-squared:  0.5328,    Adjusted R-squared:  0.5304 
F-statistic: 227.3 on 17 and 3388 DF,  p-value: < 2.2e-16

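$$
\begin{aligned}
\log(\text{price\_4\_nights}) &= \beta_0 + \beta_1 \times \text{host\_is\_superhost} + \beta_2 \times \text{host\_identity\_verified} + \beta_3 \times \text{accommodates} \\
&+ \beta_4 \times \text{number\_of\_bathrooms} + \beta_5 \times \text{number\_of\_bedrooms} + \beta_6 \times \text{amenities} \\
&+ \beta_7 \times \text{minimum\_nights} + \beta_8 \times \text{availability\_30} + \beta_9 \times \text{review\_scores\_cleanliness} \\
&+ \beta_{10} \times \text{review\_scores\_location} + \beta_{11} \times \text{instant\_bookable} + \beta_{12} \times \text{shared\_bathroom} \\
&+ \beta_{13} \times \text{entire\_loft} + \beta_{14} \times \text{entire\_rental\_unit} + \beta_{15} \times \text{other\_property} \\
&+ \beta_{16} \times \text{private\_room\_in\_rental\_unit} + \beta_{17} \times \text{review\_scores\_value}
\end{aligned}
$$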

First we check p_value for all explanatory variables and found that all are statistically significant at 95% confidence level as their p_value all fall below 0.05.

Next we exponentiate all the coefficients to determine which ones are practically significant based on their value and feasible ranges. In our findings, the cost of 4 nights will increase by more than 10% with a one-unit increase in the number_of_bathroom, number_of_bedroom, review_scores_cleanliness and review_scores_location variables, and the same holds if the property type is entire_loft. Moreover, the cost of 4 nights is expected to decrease by more than 10% if the property type is entire_rental_unit or other_property, and by more than 25% if the listing has a shared_bathroom or its property type is private_room_in_rental_unit.

Host profile predictors such as host_is_superhost, host_identity_verified variables seem to be less significant and have weaker effect on the costs of 4 nights, as compared to property related predictors.

Variables such as accommodates and availability_30, which are indicators for room size and short-term demand, seem to have a very weak effect compared to other groups of predictors.

Surprisingly, we have captured some unexpected results. For example, according to our regression result, it is expected that the cost of 4 nights will decrease by more than 20% for every unit increase in review_scores_value. This could be due to the correlation between the different review score variables, as both location and cleanliness scores contribute to the final overall scores.

# Check collinearity of the reduced model: car::vif() reports a
# (generalised) variance inflation factor for each term of model5.
car::vif(model5)
                              GVIF Df GVIF^(1/(2*Df))
host_is_superhost         1.134344  1        1.065056
host_identity_verified    1.057795  1        1.028492
accommodates              2.354902  1        1.534569
bathrooms_text            1.283881  1        1.133085
bedrooms                  2.298298  1        1.516014
amenities                 1.221802  1        1.105352
minimum_nights            1.102533  1        1.050016
availability_30           1.186263  1        1.089157
review_scores_cleanliness 2.128548  1        1.458954
review_scores_location    1.556779  1        1.247710
instant_bookable          1.058423  1        1.028797
shared_bathroom           2.685866  1        1.638861
Property_Type             2.967452  4        1.145639
review_scores_value       2.693587  1        1.641215
# Plot model5 via the autoplot method for lm fits
# (presumably ggfortify's regression diagnostics - confirm in the setup chunk).
autoplot(model5)

Strong collinearity was not detected between variables after running the VIF.

2.8 Model Comparison

  1. Create a summary table, using huxtable (https://mfa2022.netlify.app/example/modelling_side_by_side_tables/) that shows which models you worked on, which predictors are significant, the adjusted \(R^2\), and the Residual Standard Error.
# Side-by-side table of the five fitted models: coefficient estimates with
# standard errors, followed by the number of observations, R squared,
# adjusted R squared and residual standard error of each model.
# NOTE(review): with stars = NULL and bold_signif disabled, the table does
# not mark which predictors are significant.
huxreg(
  model1, model2, model3, model4, model5,
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma"
  ),
  # bold_signif = 0.05,  (currently disabled)
  stars = NULL
) %>%
  set_caption("Comparison of models")
Comparison of models
(1)(2)(3)(4)(5)
(Intercept)6.090 6.106 4.625 4.525 4.774 
(0.050)(0.048)(0.188)(0.189)(0.104)
Property_TypeEntire loft0.012 0.012 0.148 0.155 0.096 
(0.043)(0.042)(0.047)(0.048)(0.041)
Property_TypeEntire rental unit-0.154 -0.155 -0.081 -0.091 -0.124 
(0.028)(0.027)(0.029)(0.029)(0.023)
Property_TypeOther-0.185 0.181 -0.014 -0.107 -0.155 
(0.034)(0.042)(0.045)(0.036)(0.030)
Property_TypePrivate room in rental unit-0.839 -0.169 -0.107 -0.254 -0.372 
(0.033)(0.054)(0.061)(0.050)(0.036)
number_of_reviews0.001 0.001 0.000 -0.000      
(0.000)(0.000)(0.000)(0.000)     
review_scores_rating0.012 0.008 0.190           
(0.009)(0.009)(0.055)          
room_typeHotel room     0.284 0.284           
     (0.094)(0.096)          
room_typePrivate room     -0.671 -0.250           
     (0.044)(0.053)          
room_typeShared room     -0.831 -0.376           
     (0.164)(0.166)          
host_response_rate          0.010 0.006      
          (0.054)(0.054)     
host_acceptance_rate          0.008 0.009      
          (0.044)(0.044)     
host_is_superhostTRUE          0.038 0.048 0.048 
          (0.021)(0.021)(0.017)
host_listings_count          0.000 0.001      
          (0.001)(0.001)     
host_verifications          0.004 0.006      
          (0.005)(0.005)     
host_has_profile_picTRUE          0.046 0.029      
          (0.115)(0.116)     
host_identity_verifiedTRUE          0.102 0.100 0.070 
          (0.033)(0.034)(0.018)
accommodates          0.038 0.042 0.034 
          (0.007)(0.007)(0.005)
bathrooms_text          0.119 0.122 0.133 
          (0.026)(0.026)(0.022)
bedrooms          0.105 0.118 0.135 
          (0.016)(0.016)(0.013)
beds          -0.010 -0.016      
          (0.013)(0.013)     
amenities          0.003 0.004 0.004 
          (0.001)(0.001)(0.001)
minimum_nights          0.025 0.030 0.026 
          (0.012)(0.012)(0.008)
maximum_nights          0.000 0.000      
          (0.000)(0.000)     
availability_30          0.016 0.017 0.018 
          (0.002)(0.001)(0.001)
availability_60          0.001           
          (0.001)          
number_of_reviews_ltm          -0.002           
          (0.001)          
number_of_reviews_l30d          0.003           
          (0.005)          
review_scores_accuracy          -0.012           
          (0.048)          
review_scores_cleanliness          0.101 0.158 0.160 
          (0.033)(0.028)(0.018)
review_scores_checkin          -0.029 -0.010      
          (0.041)(0.041)     
review_scores_communication          -0.074 -0.033      
          (0.039)(0.037)     
review_scores_location          0.232 0.236 0.221 
          (0.034)(0.034)(0.024)
review_scores_value          -0.254 -0.180 -0.255 
          (0.050)(0.042)(0.025)
instant_bookableTRUE          -0.049 -0.063 -0.055 
          (0.021)(0.021)(0.014)
reviews_per_month          0.002 0.000      
          (0.003)(0.003)     
shared_bathroomTRUE          -0.374 -0.490 -0.374 
          (0.044)(0.039)(0.029)
#observations3922     3922     1807     1807     3406     
R squared0.222 0.277 0.538 0.522 0.533 
Adj. R Squared0.221 0.275 0.528 0.515 0.530 
Residual SE0.499 0.481 0.370 0.376 0.393 

Overall, we find that model 5 is the best of the models compared. The overall model is statistically significant according to the F-statistic, and it has a slightly better goodness of fit based on the adjusted R-squared value, even with a reduced number of explanatory variables. Model 5 yields the highest adjusted R-squared value (0.530) with fewer explanatory variables and more observations than models 3 and 4 (3,406 versus 1,807), thus reflecting the best model fit and accuracy among the models considered.

2.9 Forecasting

Suppose you are planning to visit the city you have been assigned to over reading week, and you want to stay in an Airbnb. Find Airbnb’s in your destination city that are apartments with a private room, have at least 10 reviews, and an average rating of at least 90. Use your best model to predict the total cost to stay at this Airbnb for 4 nights. Include the appropriate 95% interval with your prediction. Report the point prediction and interval in terms of price_4_nights.

# Keep only the test-set listings that match the scenario: private rooms with
# at least 10 reviews and a rating of at least 4.5
# (4.5 presumably corresponds to "90" on the 0-5 rating scale -- confirm).
# Then attach model5's predictions for the remaining listings in airbnb_test.
airbnb_test1 <- airbnb_test %>%
  filter(
    number_of_reviews >= 10,
    review_scores_rating >= 4.5,
    room_type == "Private room"
  ) %>%
  mutate(predictions = predict(model5, .))

# Sift out one sample "https://www.airbnb.com/rooms/31372144"
# Report the point prediction and its 95% prediction interval alongside the
# actual price for 4 nights.
#
# The interval comes from predict(..., interval = "prediction"), which combines
# the uncertainty of the fitted coefficients with the residual variance of a
# single new observation. An interval built from sd(response) / sqrt(n) is a
# confidence interval for the mean response, not for one listing, and is far
# too narrow for this purpose.
# Fit and bounds are exponentiated to report them on the price_4_nights scale.
# NOTE(review): assumes model5 is an lm() fit on log(price_4_nights) -- confirm.

airbnb2 <- airbnb_test1 %>%
  filter(listing_url == "https://www.airbnb.com/rooms/31372144") %>%
  bind_cols(
    .,
    # predict() returns a matrix with columns fit, lwr and upr (log scale)
    as.data.frame(
      exp(predict(model5, newdata = ., interval = "prediction", level = 0.95))
    )
  ) %>%
  transmute(
    listing_url,
    predicted_price = fit,
    lower = lwr,
    upper = upr,
    price_4_nights
  ) %>%
  print()
# A tibble: 1 x 5
  listing_url                           predicted_price lower upper price_4_nights
  <chr>                                           <dbl> <dbl> <dbl>          <dbl>
1 https://www.airbnb.com/rooms/31372144            223.  219.  227.            208

With our prediction model, we are 95% confident that the cost of 4 nights for 2 people in Montreal is within the (219, 227) price range. Unfortunately, the listing we sampled falls outside this range: its actual price for 4 nights is 208. The actual listing can be viewed at https://www.airbnb.com/rooms/31372144.

3 Acknowledgements